1 Importing libraries and dataset
Importing the libraries
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
Importing the dataset
library(ggplot2)
# Read the combined red/white wine table and store the wine type as a factor.
wine <- read.csv("../dataset/wine-quality-white-and-red.csv")
wine$type <- as.factor(wine$type)
str(wine)
## 'data.frame': 6497 obs. of 13 variables:
## $ type : Factor w/ 2 levels "red","white": 2 2 2 2 2 2 2 2 2 2 ...
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
nrow(wine)
## [1] 6497
ncol(wine)
## [1] 13
Getting the summary of the dataset
summary(wine)
## type fixed.acidity volatile.acidity citric.acid
## red :1599 Min. : 3.800 Min. :0.0800 Min. :0.0000
## white:4898 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median : 7.000 Median :0.2900 Median :0.3100
## Mean : 7.215 Mean :0.3397 Mean :0.3186
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.600 Min. :0.00900 Min. : 1.00 Min. : 6.0
## 1st Qu.: 1.800 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0
## Median : 3.000 Median :0.04700 Median : 29.00 Median :118.0
## Mean : 5.443 Mean :0.05603 Mean : 30.53 Mean :115.7
## 3rd Qu.: 8.100 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0
## Max. :65.800 Max. :0.61100 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50
## Median :0.9949 Median :3.210 Median :0.5100 Median :10.30
## Mean :0.9947 Mean :3.219 Mean :0.5313 Mean :10.49
## 3rd Qu.:0.9970 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30
## Max. :1.0390 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.818
## 3rd Qu.:6.000
## Max. :9.000
# Keep only the first occurrence of each fully identical row.
wine <- unique(wine)
nrow(wine)
## [1] 5320
# Put the rows into a reproducible random order.
set.seed(42)
rows <- sample(nrow(wine), replace = FALSE)
wine <- wine[rows, , drop = FALSE]
nrow(wine)
## [1] 5320
Checking for null values
colSums(is.na(wine))
## type fixed.acidity volatile.acidity
## 0 0 0
## citric.acid residual.sugar chlorides
## 0 0 0
## free.sulfur.dioxide total.sulfur.dioxide density
## 0 0 0
## pH sulphates alcohol
## 0 0 0
## quality
## 0
Combining the red and white wine into a single dataset and removing the duplicates.
head(wine)
## type fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 white 6.5 0.240 0.38 1.0 0.027
## 5017 red 8.8 0.550 0.04 2.2 0.119
## 2875 white 5.4 0.230 0.36 1.5 0.030
## 6442 red 11.1 0.440 0.42 2.2 0.064
## 1301 white 9.0 0.245 0.38 5.9 0.045
## 1486 white 7.4 0.280 0.49 1.5 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 5017 14 56 0.99620 3.21 0.60 10.9
## 2875 74 121 0.98976 3.24 0.99 12.1
## 6442 14 19 0.99758 3.25 0.57 10.4
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## quality
## 3181 6
## 5017 6
## 2875 7
## 6442 6
## 1301 6
## 1486 6
2 EDA
Distribution of quality scores for white and red wines.
# Count of wines per quality score, split by wine type.
# `quality` is an integer score, so each score is counted directly with
# geom_bar() instead of being spread over stat_bin()'s default 30 bins.
ggplot(wine, aes(x = quality, color = type)) +
  geom_bar(fill = "white", position = "dodge") +
  scale_x_continuous(breaks = 3:9) +
  theme(legend.position = "top") +
  xlab("Quality of Wine") +
  ylab("Number of Wines")
Secondly, we would like to see how each independent variable differs between white and red wine.
- fixed.acidity
ggplot(wine) +
  geom_boxplot(aes(x = fixed.acidity, colour = type))
- volatile.acidity
ggplot(wine) +
  geom_boxplot(aes(x = volatile.acidity, colour = type))
3.citric.acid
ggplot(wine) +
  geom_boxplot(aes(x = citric.acid, colour = type))
4.residual.sugar
ggplot(wine) +
  geom_boxplot(aes(x = residual.sugar, colour = type))
5.chlorides
ggplot(wine) +
  geom_boxplot(aes(x = chlorides, colour = type))
6.free.sulfur.dioxide
ggplot(wine) +
  geom_boxplot(aes(x = free.sulfur.dioxide, colour = type))
7.total.sulfur.dioxide
ggplot(wine) +
  geom_boxplot(aes(x = total.sulfur.dioxide, colour = type))
8.density
ggplot(wine) +
  geom_boxplot(aes(x = density, colour = type))
9.pH
ggplot(wine) +
  geom_boxplot(aes(x = pH, colour = type))
10.sulphates
ggplot(wine) +
  geom_boxplot(aes(x = sulphates, colour = type))
11.alcohol
ggplot(wine) +
  geom_boxplot(aes(x = alcohol, colour = type))
From the boxplots, we can see that the distributions differ noticeably between red and white wine for the following variables: fixed.acidity, residual.sugar, total.sulfur.dioxide, free.sulfur.dioxide, chlorides, volatile.acidity
# Correlation matrix of all columns except the categorical `type`,
# drawn as an upper-triangle correlation plot.
wine_noca <- wine[, names(wine) != "type"]
wine_nocacor <- cor(wine_noca)
corrplot(wine_nocacor, type = "upper")
As we can see from the correlation diagram, the variables below are the ones most strongly correlated with quality, and we would like to dig further into their effect: quality vs alcohol, quality vs density, quality vs volatile.acidity, quality vs chlorides
quality vs alcohol quality vs sulphates quality vs citric.acid quality vs volatile.acidity
We will analyse each variable in the following steps: 1. normality check - Q-Q plot & histogram 2. correlation test 3. scatter plot & boxplot - check the relationship 4. ANOVA test - compare the means of the attribute across quality levels and check whether the differences are statistically significant.
1.quality vs alcohol The distribution of alcohol data is right skewed
# Normality check for alcohol: Q-Q plot with reference line, then a histogram.
qqnorm(wine[["alcohol"]], pch = 1, frame = FALSE)
qqline(wine[["alcohol"]], col = "steelblue", lwd = 2)
ggplot(wine) + geom_histogram(aes(x = alcohol))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cor.test(wine$alcohol,wine$quality)
##
## Pearson's product-moment correlation
##
## data: wine$alcohol and wine$quality
## t = 38.769, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4482031 0.4901119
## sample estimates:
## cor
## 0.4694218
# Quality against alcohol with one linear fit per wine type.
ggplot(wine, aes(x = alcohol, y = quality, colour = type, shape = type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "alcohol vs Quality")
## `geom_smooth()` using formula = 'y ~ x'
# Copy of the data with quality as a factor, used for the grouped boxplots
# and the ANOVA models.
wine_cat <- wine
wine_cat[["quality"]] <- factor(wine_cat[["quality"]])
ggplot(wine_cat, aes(x = quality, y = alcohol, fill = type)) +
  geom_boxplot() +
  facet_wrap(vars(type)) +
  labs(title = "alcohol vs Quality")
# One-way ANOVA of alcohol across the quality levels.
anova_alcohol <- aov(alcohol ~ quality, data = wine_cat)
summary(anova_alcohol)
## Df Sum Sq Mean Sq F value Pr(>F)
## quality 6 1933 322.1 308.4 <2e-16 ***
## Residuals 5313 5548 1.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2.quality vs density Density plot looks normally distributed
# Normality check for density: Q-Q plot with reference line, then a histogram.
qqnorm(wine[["density"]], pch = 1, frame = FALSE)
qqline(wine[["density"]], col = "steelblue", lwd = 2)
ggplot(wine) + geom_histogram(aes(x = density))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cor.test(wine$density,wine$quality)
##
## Pearson's product-moment correlation
##
## data: wine$density and wine$quality
## t = -25.185, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3502348 -0.3022129
## sample estimates:
## cor
## -0.3264345
# Quality against density with one linear fit per wine type.
ggplot(wine, aes(x = density, y = quality, colour = type, shape = type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "density vs Quality")
## `geom_smooth()` using formula = 'y ~ x'
# Density per quality level, one panel per wine type.
ggplot(wine_cat, aes(x = quality, y = density, fill = type)) +
  geom_boxplot() +
  facet_wrap(vars(type)) +
  labs(title = "density vs Quality")
# One-way ANOVA of density across the quality levels.
anova_density <- aov(density ~ quality, data = wine_cat)
summary(anova_density)
## Df Sum Sq Mean Sq F value Pr(>F)
## quality 6 0.00600 0.0010007 130.4 <2e-16 ***
## Residuals 5313 0.04077 0.0000077
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3.quality vs volatile.acidity The distribution of volatile.acidity is almost normal; however, there is a small tail on the right side of the plot
# Normality check for volatile.acidity: Q-Q plot with reference line, then a
# histogram.
qqnorm(wine[["volatile.acidity"]], pch = 1, frame = FALSE)
qqline(wine[["volatile.acidity"]], col = "steelblue", lwd = 2)
ggplot(wine) + geom_histogram(aes(x = volatile.acidity))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cor.test(wine$volatile.acidity,wine$quality)
##
## Pearson's product-moment correlation
##
## data: wine$volatile.acidity and wine$quality
## t = -20.058, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2900109 -0.2400432
## sample estimates:
## cor
## -0.2652051
# Quality against volatile.acidity with one linear fit per wine type.
ggplot(
  wine,
  aes(x = volatile.acidity, y = quality, colour = type, shape = type)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "volatile.acidity vs Quality")
## `geom_smooth()` using formula = 'y ~ x'
# volatile.acidity per quality level, one panel per wine type.
ggplot(wine_cat, aes(x = quality, y = volatile.acidity, fill = type)) +
  geom_boxplot() +
  facet_wrap(vars(type)) +
  labs(title = "volatile.acidity vs Quality")
# One-way ANOVA of volatile.acidity across the quality levels.
anova_volatile.acidity <- aov(volatile.acidity ~ quality, data = wine_cat)
summary(anova_volatile.acidity)
## Df Sum Sq Mean Sq F value Pr(>F)
## quality 6 12.57 2.096 80.68 <2e-16 ***
## Residuals 5313 137.99 0.026
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4.quality vs chlorides The distribution looks roughly normal but is also right skewed
# Normality check and correlation test for chlorides.
# These calls previously inspected `density` (copied from section 2) although
# this section, and the plots and ANOVA that follow, analyse chlorides.
qqnorm(wine$chlorides, pch = 1, frame = FALSE)
qqline(wine$chlorides, col = "steelblue", lwd = 2)
ggplot(wine, aes(x = chlorides)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# NOTE: the Pearson output printed below was produced by the old density call;
# re-run this chunk to refresh it for chlorides.
cor.test(wine$chlorides, wine$quality)
##
## Pearson's product-moment correlation
##
## data: wine$density and wine$quality
## t = -25.185, df = 5318, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3502348 -0.3022129
## sample estimates:
## cor
## -0.3264345
# Quality against chlorides with one linear fit per wine type.
ggplot(wine, aes(x = chlorides, y = quality, colour = type, shape = type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(title = "chlorides vs Quality")
## `geom_smooth()` using formula = 'y ~ x'
# chlorides per quality level, one panel per wine type.
ggplot(wine_cat, aes(x = quality, y = chlorides, fill = type)) +
  geom_boxplot() +
  facet_wrap(vars(type)) +
  labs(title = "chlorides vs Quality")
# One-way ANOVA of chlorides across the quality levels.
anova_chlorides <- aov(chlorides ~ quality, data = wine_cat)
summary(anova_chlorides)
## Df Sum Sq Mean Sq F value Pr(>F)
## quality 6 0.337 0.05623 43.35 <2e-16 ***
## Residuals 5313 6.891 0.00130
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
3 Modeling
# Scatter plot for every pair of columns, excluding the last column (quality),
# with points coloured by quality. The column indices are printed before each
# plot, as before.
# seq_len() yields an empty loop instead of counting backwards when there are
# too few columns.
for (xx in seq_len(max(ncol(wine) - 2L, 0L))) {
  for (yy in (xx + 1L):(ncol(wine) - 1L)) {
    print(xx)
    print(yy)
    x_name <- colnames(wine)[xx]
    y_name <- colnames(wine)[yy]
    # Refer to columns through the `.data` pronoun so ggplot looks them up in
    # the plot data rather than receiving a detached vector via wine[, i].
    p <- ggplot(
      wine,
      aes(x = .data[[x_name]], y = .data[[y_name]], color = quality)
    ) +
      geom_point() +
      labs(x = x_name, y = y_name, title = paste(y_name, "vs", x_name))
    print(p)
  }
}
## [1] 1
## [1] 2
## [1] 1
## [1] 3
## [1] 1
## [1] 4
## [1] 1
## [1] 5
## [1] 1
## [1] 6
## [1] 1
## [1] 7
## [1] 1
## [1] 8
## [1] 1
## [1] 9
## [1] 1
## [1] 10
## [1] 1
## [1] 11
## [1] 1
## [1] 12
## [1] 2
## [1] 3
## [1] 2
## [1] 4
## [1] 2
## [1] 5
## [1] 2
## [1] 6
## [1] 2
## [1] 7
## [1] 2
## [1] 8
## [1] 2
## [1] 9
## [1] 2
## [1] 10
## [1] 2
## [1] 11
## [1] 2
## [1] 12
## [1] 3
## [1] 4
## [1] 3
## [1] 5
## [1] 3
## [1] 6
## [1] 3
## [1] 7
## [1] 3
## [1] 8
## [1] 3
## [1] 9
## [1] 3
## [1] 10
## [1] 3
## [1] 11
## [1] 3
## [1] 12
## [1] 4
## [1] 5
## [1] 4
## [1] 6
## [1] 4
## [1] 7
## [1] 4
## [1] 8
## [1] 4
## [1] 9
## [1] 4
## [1] 10
## [1] 4
## [1] 11
## [1] 4
## [1] 12
## [1] 5
## [1] 6
## [1] 5
## [1] 7
## [1] 5
## [1] 8
## [1] 5
## [1] 9
## [1] 5
## [1] 10
## [1] 5
## [1] 11
## [1] 5
## [1] 12
## [1] 6
## [1] 7
## [1] 6
## [1] 8
## [1] 6
## [1] 9
## [1] 6
## [1] 10
## [1] 6
## [1] 11
## [1] 6
## [1] 12
## [1] 7
## [1] 8
## [1] 7
## [1] 9
## [1] 7
## [1] 10
## [1] 7
## [1] 11
## [1] 7
## [1] 12
## [1] 8
## [1] 9
## [1] 8
## [1] 10
## [1] 8
## [1] 11
## [1] 8
## [1] 12
## [1] 9
## [1] 10
## [1] 9
## [1] 11
## [1] 9
## [1] 12
## [1] 10
## [1] 11
## [1] 10
## [1] 12
## [1] 11
## [1] 12
3.1 KNN Algorithm
# Build the KNN data set: binary label ("high" when quality > 5, else "low")
# plus the numeric features; the original `type` and `quality` columns
# (positions 1 and 13) are dropped.
set.seed(1000)
qualityvariable <- ifelse(wine[["quality"]] > 5, "high", "low")
wineknn <- data.frame(wine, qualityvariable)
wineknn <- wineknn[, -c(1, 13)]
head(wineknn)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 6.5 0.240 0.38 1.0 0.027
## 5017 8.8 0.550 0.04 2.2 0.119
## 2875 5.4 0.230 0.36 1.5 0.030
## 6442 11.1 0.440 0.42 2.2 0.064
## 1301 9.0 0.245 0.38 5.9 0.045
## 1486 7.4 0.280 0.49 1.5 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 5017 14 56 0.99620 3.21 0.60 10.9
## 2875 74 121 0.98976 3.24 0.99 12.1
## 6442 14 19 0.99758 3.25 0.57 10.4
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## qualityvariable
## 3181 high
## 5017 high
## 2875 high
## 6442 high
## 1301 high
## 1486 high
# Centre and scale the 11 feature columns (scale() centres and scales by
# default); the label column is left out.
scaleddata <- as.data.frame(scale(wineknn[, 1:11]))
head(scaleddata)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 -0.5419372 -0.6189051 0.4179584 -0.8996256 -0.8054037
## 5017 1.2009219 1.2236103 -1.8924939 -0.6329697 1.6903024
## 2875 -1.3754784 -0.6783410 0.2820494 -0.7885190 -0.7240220
## 6442 2.9437810 0.5698145 0.6897763 -0.6329697 0.1983042
## 1301 1.3524749 -0.5891871 0.4179584 0.1892197 -0.3171134
## 1486 0.1400512 -0.3811611 1.1654576 -0.7885190 -0.6155130
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 3181 0.05410522 -0.4246473 -1.7788872 0.09562628 -1.1576984
## 5017 -0.90068036 -1.0235107 0.5613550 -0.09143040 0.4450484
## 2875 2.46915110 0.1213751 -1.6102818 0.09562628 3.0495120
## 6442 -0.90068036 -1.6752149 1.0267058 0.15797850 0.2447051
## 1301 1.23354623 0.7906929 0.1567022 -1.83729267 -1.2244795
## 1486 -0.56369721 0.2094432 -0.9223720 -1.52553155 -0.9573550
## alcohol
## 3181 1.47627132
## 5017 0.29576611
## 2875 1.30762772
## 6442 -0.12584289
## 1301 -0.29448649
## 1486 0.04280071
# Assign every row to group 1 (training, p = 0.67) or group 2 (test, p = 0.33).
wine_sample <- sample(
  2,
  size = nrow(scaleddata),
  replace = TRUE,
  prob = c(0.67, 0.33)
)
wine_training <- scaleddata[wine_sample == 1, , drop = FALSE]
wine_test <- scaleddata[wine_sample == 2, , drop = FALSE]
nrow(wine_test)
## [1] 1741
# Class labels (column 12, `qualityvariable`) for the same train/test rows.
wine.trainLabels <- wineknn[["qualityvariable"]][wine_sample == 1]
wine.testLabels <- wineknn[["qualityvariable"]][wine_sample == 2]
length(wine.testLabels)
## [1] 1741
library(class)
library(caret)
## Loading required package: lattice
nrow(wine_training)
## [1] 3579
length(wine.trainLabels)
## [1] 3579
wine_model_pred <- knn(train = wine_training, test = wine_test, cl=wine.trainLabels, k=19)
library(gmodels)
crosst <- CrossTable(wine.testLabels, wine_model_pred, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1741
##
##
## | wine_model_pred
## wine.testLabels | high | low | Row Total |
## ----------------|-----------|-----------|-----------|
## high | 920 | 164 | 1084 |
## | 0.849 | 0.151 | 0.623 |
## | 0.767 | 0.303 | |
## | 0.528 | 0.094 | |
## ----------------|-----------|-----------|-----------|
## low | 280 | 377 | 657 |
## | 0.426 | 0.574 | 0.377 |
## | 0.233 | 0.697 | |
## | 0.161 | 0.217 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 1200 | 541 | 1741 |
## | 0.689 | 0.311 | |
## ----------------|-----------|-----------|-----------|
##
##
crosst
## $t
## y
## x high low
## high 920 164
## low 280 377
##
## $prop.row
## y
## x high low
## high 0.8487085 0.1512915
## low 0.4261796 0.5738204
##
## $prop.col
## y
## x high low
## high 0.7666667 0.3031423
## low 0.2333333 0.6968577
##
## $prop.tbl
## y
## x high low
## high 0.52843194 0.09419874
## low 0.16082711 0.21654222
# Confusion matrix (caret) of the k = 19 predictions against the test labels,
# followed by the overall accuracy.
cm <- confusionMatrix(
  data = wine_model_pred,
  reference = factor(wine.testLabels)
)
print(paste("Total Accuracy = ", cm$overall[["Accuracy"]]))
## [1] "Total Accuracy = 0.744974152785755"
# Fit a k-nearest-neighbour classifier and report its accuracy on a
# validation set.
#
# Args:
#   k:           number of neighbours passed to knn().
#   train_set:   training cases (features only).
#   val_set:     validation cases (features only).
#   train_class: class labels for the rows of `train_set`.
#   val_class:   true class labels for the rows of `val_set`.
#
# Returns:
#   A 1 x 2 matrix with columns `k` and `accuracy`.
#
# Side effect: resets the global RNG seed to 1 on every call so that knn()'s
# random tie-breaking gives the same result for repeated runs.
chooseK <- function(k, train_set, val_set, train_class, val_class) {
  set.seed(1)
  # use.all is left at its default.
  class_knn <- knn(
    train = train_set,
    test = val_set,
    cl = train_class,
    k = k
  )
  # Compare predictions with the truth element by element. Reading the
  # positional diagonal of table(class_knn, val_class) is only correct when
  # both carry the same classes in the same order; it miscounts when the
  # validation labels lack one of the predicted classes. Missing labels are
  # ignored, as table() ignored them.
  accu <- mean(as.character(class_knn) == as.character(val_class), na.rm = TRUE)
  cbind(k = k, accuracy = accu)
}
# Run chooseK() once for every odd k in 1, 3, ..., 29 without an explicit
# for() loop. sapply() gathers the results as the columns of a two-row matrix
# (row 1 = k, row 2 = accuracy).
knn_different_k <- sapply(
  seq(from = 1, to = 30, by = 2),
  function(k_value) {
    chooseK(
      k_value,
      train_set = wine_training,
      val_set = wine_test,
      train_class = wine.trainLabels,
      val_class = wine.testLabels
    )
  }
)
# Reformat the results to graph the results.
str(knn_different_k)
## num [1:2, 1:15] 1 0.674 3 0.706 5 ...
# Turn the 2 x n result matrix into a data frame with one row per k.
knn_different_k <- setNames(
  as.data.frame(t(knn_different_k)),
  c("k", "accuracy")
)
knn_different_k
## k accuracy
## 1 1 0.6743251
## 2 3 0.7064905
## 3 5 0.7237220
## 4 7 0.7248708
## 5 9 0.7248708
## 6 11 0.7288914
## 7 13 0.7225732
## 8 15 0.7340609
## 9 17 0.7409535
## 10 19 0.7449742
## 11 21 0.7415279
## 12 23 0.7432510
## 13 25 0.7357840
## 14 27 0.7369328
## 15 29 0.7409535
# Plot accuracy vs. k.
# install.packages("ggplot2")
# Validation accuracy for each k tried, drawn as a line with point markers.
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(colour = "orange", linewidth = 1.5) +
  geom_point(size = 3) +
  ggtitle("accuracy vs k")
3.2 Decision Tree
# Decision-tree data set: same binary label as for KNN ("high" when
# quality > 5, else "low").
set.seed(1000)
qualityvariable <- ifelse(wine[["quality"]] > 5, "high", "low")
winedc <- data.frame(wine, qualityvariable)
table(winedc[["qualityvariable"]])
##
## high low
## 3332 1988
# Drop `type` (column 1) and `quality` (column 13), then split the rows into
# group 1 (training, p = 0.67) and group 2 (test, p = 0.33).
winedc <- winedc[, -c(1, 13)]
wine_sample_for_dc <- sample(
  2,
  size = nrow(winedc),
  replace = TRUE,
  prob = c(0.67, 0.33)
)
wine_training_for_dc <- winedc[wine_sample_for_dc == 1, , drop = FALSE]
nrow(wine_training_for_dc)
## [1] 3579
wine_test_for_dc <- winedc[wine_sample_for_dc == 2, , drop = FALSE]
nrow(wine_test_for_dc)
## [1] 1741
library(rpart)
library(rpart.plot)
# Tree-growing limits, then a classification tree of the quality label on all
# remaining columns of the training set.
control <- rpart.control(
  minsplit = 5L,
  minbucket = 5,
  cp = 0.002,
  maxsurrogate = 4,
  maxdepth = 5L
)
modeldc <- rpart(
  qualityvariable ~ .,
  data = wine_training_for_dc,
  method = "class",
  control = control
)
summary(modeldc)
## Call:
## rpart(formula = qualityvariable ~ ., data = wine_training_for_dc,
## method = "class", control = control)
## n= 3579
##
## CP nsplit rel error xerror xstd
## 1 0.155522164 0 1.0000000 1.0000000 0.02172342
## 2 0.141247183 1 0.8444778 0.8572502 0.02094598
## 3 0.008640120 2 0.7032307 0.7280240 0.01997208
## 4 0.007513148 5 0.6754320 0.7017280 0.01973910
## 5 0.006260957 9 0.6416228 0.6979715 0.01970479
## 6 0.003756574 12 0.6228400 0.6919609 0.01964936
## 7 0.003005259 13 0.6190834 0.6934636 0.01966328
## 8 0.002629602 16 0.6100676 0.7009767 0.01973226
## 9 0.002253944 20 0.5995492 0.6972201 0.01969790
## 10 0.002000000 22 0.5950413 0.6972201 0.01969790
##
## Variable importance
## alcohol density chlorides
## 28 15 13
## volatile.acidity total.sulfur.dioxide residual.sugar
## 13 10 9
## free.sulfur.dioxide sulphates citric.acid
## 6 3 1
## fixed.acidity pH
## 1 1
##
## Node number 1: 3579 observations, complexity param=0.1555222
## predicted class=high expected loss=0.3718916 P(node) =1
## class counts: 2248 1331
## probabilities: 0.628 0.372
## left son=2 (1832 obs) right son=3 (1747 obs)
## Primary splits:
## alcohol < 10.35 to the right, improve=239.59600, (0 missing)
## density < 0.99285 to the left, improve=134.90020, (0 missing)
## volatile.acidity < 0.4575 to the left, improve= 90.81570, (0 missing)
## chlorides < 0.0495 to the left, improve= 85.99684, (0 missing)
## citric.acid < 0.235 to the right, improve= 70.23665, (0 missing)
## Surrogate splits:
## density < 0.993395 to the left, agree=0.760, adj=0.508, (0 split)
## chlorides < 0.0415 to the left, agree=0.677, adj=0.339, (0 split)
## total.sulfur.dioxide < 139.5 to the left, agree=0.638, adj=0.258, (0 split)
## residual.sugar < 6.975 to the left, agree=0.626, adj=0.234, (0 split)
##
## Node number 2: 1832 observations, complexity param=0.007513148
## predicted class=high expected loss=0.1932314 P(node) =0.5118748
## class counts: 1478 354
## probabilities: 0.807 0.193
## left son=4 (668 obs) right son=5 (1164 obs)
## Primary splits:
## alcohol < 11.71667 to the right, improve=31.74577, (0 missing)
## free.sulfur.dioxide < 11.5 to the right, improve=24.90616, (0 missing)
## volatile.acidity < 0.5475 to the left, improve=18.42472, (0 missing)
## density < 0.99167 to the left, improve=17.23243, (0 missing)
## citric.acid < 0.255 to the right, improve=13.60497, (0 missing)
## Surrogate splits:
## density < 0.99087 to the left, agree=0.776, adj=0.386, (0 split)
## chlorides < 0.0335 to the left, agree=0.673, adj=0.103, (0 split)
## fixed.acidity < 5.45 to the left, agree=0.652, adj=0.046, (0 split)
## sulphates < 0.365 to the left, agree=0.648, adj=0.036, (0 split)
##
## Node number 3: 1747 observations, complexity param=0.1412472
## predicted class=low expected loss=0.4407556 P(node) =0.4881252
## class counts: 770 977
## probabilities: 0.441 0.559
## left son=6 (694 obs) right son=7 (1053 obs)
## Primary splits:
## volatile.acidity < 0.275 to the left, improve=87.28633, (0 missing)
## citric.acid < 0.235 to the right, improve=25.28865, (0 missing)
## chlorides < 0.0595 to the left, improve=23.97400, (0 missing)
## alcohol < 9.85 to the right, improve=22.56507, (0 missing)
## free.sulfur.dioxide < 24.5 to the right, improve=10.83186, (0 missing)
## Surrogate splits:
## chlorides < 0.0595 to the left, agree=0.693, adj=0.226, (0 split)
## free.sulfur.dioxide < 36.5 to the right, agree=0.657, adj=0.135, (0 split)
## residual.sugar < 7.85 to the right, agree=0.641, adj=0.095, (0 split)
## sulphates < 0.445 to the left, agree=0.638, adj=0.089, (0 split)
##
## Node number 4: 668 observations
## predicted class=high expected loss=0.07035928 P(node) =0.1866443
## class counts: 621 47
## probabilities: 0.930 0.070
##
## Node number 5: 1164 observations, complexity param=0.007513148
## predicted class=high expected loss=0.2637457 P(node) =0.3252305
## class counts: 857 307
## probabilities: 0.736 0.264
## left son=10 (848 obs) right son=11 (316 obs)
## Primary splits:
## free.sulfur.dioxide < 16.5 to the right, improve=23.181820, (0 missing)
## volatile.acidity < 0.4025 to the left, improve=16.839020, (0 missing)
## citric.acid < 0.255 to the right, improve= 9.986410, (0 missing)
## residual.sugar < 0.975 to the right, improve= 7.686083, (0 missing)
## sulphates < 0.635 to the right, improve= 7.114477, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 33.5 to the right, agree=0.851, adj=0.453, (0 split)
## chlorides < 0.0605 to the left, agree=0.771, adj=0.158, (0 split)
## fixed.acidity < 8.55 to the left, agree=0.765, adj=0.133, (0 split)
## volatile.acidity < 0.4425 to the left, agree=0.754, adj=0.095, (0 split)
##
## Node number 6: 694 observations, complexity param=0.00864012
## predicted class=high expected loss=0.3645533 P(node) =0.1939089
## class counts: 441 253
## probabilities: 0.635 0.365
## left son=12 (361 obs) right son=13 (333 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the left, improve=13.825610, (0 missing)
## alcohol < 9.55 to the right, improve= 7.818614, (0 missing)
## residual.sugar < 17.65 to the left, improve= 6.397418, (0 missing)
## sulphates < 0.485 to the right, improve= 5.381152, (0 missing)
## free.sulfur.dioxide < 23.5 to the right, improve= 5.369760, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 153.5 to the left, agree=0.641, adj=0.252, (0 split)
## density < 0.995215 to the left, agree=0.620, adj=0.207, (0 split)
## residual.sugar < 7.15 to the left, agree=0.610, adj=0.186, (0 split)
## free.sulfur.dioxide < 47.5 to the left, agree=0.591, adj=0.147, (0 split)
##
## Node number 7: 1053 observations, complexity param=0.006260957
## predicted class=low expected loss=0.3124406 P(node) =0.2942163
## class counts: 329 724
## probabilities: 0.312 0.688
## left son=14 (261 obs) right son=15 (792 obs)
## Primary splits:
## alcohol < 9.85 to the right, improve=8.837949, (0 missing)
## sulphates < 0.545 to the right, improve=7.746356, (0 missing)
## volatile.acidity < 0.555 to the left, improve=7.021819, (0 missing)
## fixed.acidity < 10.05 to the right, improve=6.366498, (0 missing)
## total.sulfur.dioxide < 51.5 to the left, improve=6.361199, (0 missing)
## Surrogate splits:
## density < 0.99273 to the left, agree=0.762, adj=0.038, (0 split)
## fixed.acidity < 5.1 to the left, agree=0.756, adj=0.015, (0 split)
## pH < 3.565 to the right, agree=0.756, adj=0.015, (0 split)
## volatile.acidity < 0.99 to the right, agree=0.754, adj=0.008, (0 split)
##
## Node number 10: 848 observations, complexity param=0.003005259
## predicted class=high expected loss=0.2028302 P(node) =0.2369377
## class counts: 676 172
## probabilities: 0.797 0.203
## left son=20 (709 obs) right son=21 (139 obs)
## Primary splits:
## volatile.acidity < 0.375 to the left, improve=8.183534, (0 missing)
## residual.sugar < 14.55 to the left, improve=5.237579, (0 missing)
## chlorides < 0.0395 to the left, improve=5.143133, (0 missing)
## citric.acid < 0.255 to the right, improve=4.576281, (0 missing)
## alcohol < 10.99 to the right, improve=4.322759, (0 missing)
## Surrogate splits:
## chlorides < 0.0635 to the left, agree=0.889, adj=0.324, (0 split)
## citric.acid < 0.155 to the right, agree=0.888, adj=0.317, (0 split)
## total.sulfur.dioxide < 65.5 to the right, agree=0.881, adj=0.273, (0 split)
## density < 0.99721 to the left, agree=0.857, adj=0.129, (0 split)
##
## Node number 11: 316 observations, complexity param=0.007513148
## predicted class=high expected loss=0.4272152 P(node) =0.08829282
## class counts: 181 135
## probabilities: 0.573 0.427
## left son=22 (156 obs) right son=23 (160 obs)
## Primary splits:
## sulphates < 0.575 to the right, improve=17.977220, (0 missing)
## chlorides < 0.0405 to the right, improve= 8.420782, (0 missing)
## residual.sugar < 1.675 to the right, improve= 8.367262, (0 missing)
## total.sulfur.dioxide < 36 to the left, improve= 6.198053, (0 missing)
## alcohol < 11.15 to the right, improve= 5.308257, (0 missing)
## Surrogate splits:
## density < 0.9949 to the right, agree=0.741, adj=0.474, (0 split)
## chlorides < 0.0595 to the right, agree=0.731, adj=0.455, (0 split)
## total.sulfur.dioxide < 36 to the left, agree=0.728, adj=0.449, (0 split)
## citric.acid < 0.395 to the right, agree=0.655, adj=0.301, (0 split)
##
## Node number 12: 361 observations, complexity param=0.002629602
## predicted class=high expected loss=0.2686981 P(node) =0.1008662
## class counts: 264 97
## probabilities: 0.731 0.269
## left son=24 (254 obs) right son=25 (107 obs)
## Primary splits:
## free.sulfur.dioxide < 26.5 to the right, improve=7.014382, (0 missing)
## residual.sugar < 5.45 to the right, improve=4.797638, (0 missing)
## pH < 2.995 to the right, improve=3.981735, (0 missing)
## fixed.acidity < 9.15 to the left, improve=3.931793, (0 missing)
## chlorides < 0.0335 to the right, improve=3.860880, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 103.5 to the right, agree=0.814, adj=0.374, (0 split)
## fixed.acidity < 8.95 to the left, agree=0.723, adj=0.065, (0 split)
## residual.sugar < 1.35 to the right, agree=0.717, adj=0.047, (0 split)
## density < 0.99193 to the right, agree=0.715, adj=0.037, (0 split)
##
## Node number 13: 333 observations, complexity param=0.00864012
## predicted class=high expected loss=0.4684685 P(node) =0.09304275
## class counts: 177 156
## probabilities: 0.532 0.468
## left son=26 (272 obs) right son=27 (61 obs)
## Primary splits:
## alcohol < 9.05 to the right, improve=7.232727, (0 missing)
## chlorides < 0.0495 to the left, improve=6.230406, (0 missing)
## citric.acid < 0.195 to the right, improve=5.214211, (0 missing)
## pH < 2.955 to the left, improve=4.607654, (0 missing)
## sulphates < 0.475 to the right, improve=4.486647, (0 missing)
## Surrogate splits:
## residual.sugar < 17.9 to the left, agree=0.829, adj=0.066, (0 split)
## density < 0.999725 to the left, agree=0.829, adj=0.066, (0 split)
## total.sulfur.dioxide < 229 to the left, agree=0.823, adj=0.033, (0 split)
##
## Node number 14: 261 observations, complexity param=0.006260957
## predicted class=low expected loss=0.4252874 P(node) =0.0729254
## class counts: 111 150
## probabilities: 0.425 0.575
## left son=28 (190 obs) right son=29 (71 obs)
## Primary splits:
## density < 0.99467 to the right, improve=4.849958, (0 missing)
## volatile.acidity < 0.555 to the left, improve=4.788846, (0 missing)
## sulphates < 0.545 to the right, improve=3.824913, (0 missing)
## pH < 3.425 to the left, improve=3.768881, (0 missing)
## fixed.acidity < 7.55 to the right, improve=3.652153, (0 missing)
## Surrogate splits:
## residual.sugar < 1.65 to the right, agree=0.828, adj=0.366, (0 split)
## chlorides < 0.0475 to the right, agree=0.785, adj=0.211, (0 split)
## fixed.acidity < 5.75 to the right, agree=0.762, adj=0.127, (0 split)
## volatile.acidity < 0.315 to the right, agree=0.762, adj=0.127, (0 split)
##
## Node number 15: 792 observations, complexity param=0.002629602
## predicted class=low expected loss=0.2752525 P(node) =0.2212909
## class counts: 218 574
## probabilities: 0.275 0.725
## left son=30 (434 obs) right son=31 (358 obs)
## Primary splits:
## volatile.acidity < 0.4225 to the left, improve=5.179714, (0 missing)
## fixed.acidity < 10.85 to the right, improve=4.327723, (0 missing)
## sulphates < 0.685 to the right, improve=4.186987, (0 missing)
## density < 1.00129 to the right, improve=2.771093, (0 missing)
## total.sulfur.dioxide < 41.5 to the left, improve=2.726417, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 111.5 to the right, agree=0.768, adj=0.486, (0 split)
## chlorides < 0.0635 to the left, agree=0.749, adj=0.444, (0 split)
## residual.sugar < 4.55 to the right, agree=0.707, adj=0.352, (0 split)
## free.sulfur.dioxide < 21.5 to the right, agree=0.705, adj=0.346, (0 split)
##
## Node number 20: 709 observations, complexity param=0.002253944
## predicted class=high expected loss=0.1720733 P(node) =0.1981
## class counts: 587 122
## probabilities: 0.828 0.172
## left son=40 (700 obs) right son=41 (9 obs)
## Primary splits:
## residual.sugar < 14.55 to the left, improve=4.459819, (0 missing)
## pH < 2.975 to the right, improve=4.071160, (0 missing)
## sulphates < 0.525 to the right, improve=4.016235, (0 missing)
## chlorides < 0.0385 to the left, improve=2.880302, (0 missing)
## fixed.acidity < 7.05 to the left, improve=2.665545, (0 missing)
##
## Node number 21: 139 observations, complexity param=0.003005259
## predicted class=high expected loss=0.3597122 P(node) =0.03883766
## class counts: 89 50
## probabilities: 0.640 0.360
## left son=42 (95 obs) right son=43 (44 obs)
## Primary splits:
## total.sulfur.dioxide < 123 to the left, improve=6.882365, (0 missing)
## residual.sugar < 5.45 to the left, improve=4.293256, (0 missing)
## alcohol < 10.65 to the right, improve=4.250181, (0 missing)
## chlorides < 0.0535 to the right, improve=4.228158, (0 missing)
## fixed.acidity < 7.45 to the right, improve=3.966908, (0 missing)
## Surrogate splits:
## free.sulfur.dioxide < 38.5 to the left, agree=0.784, adj=0.318, (0 split)
## residual.sugar < 4.125 to the left, agree=0.777, adj=0.295, (0 split)
## chlorides < 0.063 to the right, agree=0.763, adj=0.250, (0 split)
## sulphates < 0.515 to the right, agree=0.734, adj=0.159, (0 split)
##
## Node number 22: 156 observations, complexity param=0.003005259
## predicted class=high expected loss=0.2564103 P(node) =0.04358759
## class counts: 116 40
## probabilities: 0.744 0.256
## left son=44 (142 obs) right son=45 (14 obs)
## Primary splits:
## residual.sugar < 1.55 to the right, improve=4.593819, (0 missing)
## total.sulfur.dioxide < 76.5 to the left, improve=4.136950, (0 missing)
## alcohol < 11.25 to the right, improve=3.214031, (0 missing)
## chlorides < 0.0425 to the right, improve=3.152181, (0 missing)
## pH < 3.085 to the right, improve=3.152181, (0 missing)
## Surrogate splits:
## density < 0.992005 to the right, agree=0.942, adj=0.357, (0 split)
## chlorides < 0.0365 to the right, agree=0.936, adj=0.286, (0 split)
## volatile.acidity < 0.185 to the right, agree=0.917, adj=0.071, (0 split)
##
## Node number 23: 160 observations, complexity param=0.007513148
## predicted class=low expected loss=0.40625 P(node) =0.04470522
## class counts: 65 95
## probabilities: 0.406 0.594
## left son=46 (75 obs) right son=47 (85 obs)
## Primary splits:
## free.sulfur.dioxide < 11.5 to the right, improve=10.599260, (0 missing)
## volatile.acidity < 0.345 to the left, improve= 6.201674, (0 missing)
## citric.acid < 0.265 to the right, improve= 4.206731, (0 missing)
## pH < 3.365 to the left, improve= 4.167787, (0 missing)
## density < 0.99571 to the left, improve= 2.812500, (0 missing)
## Surrogate splits:
## volatile.acidity < 0.2975 to the left, agree=0.662, adj=0.280, (0 split)
## total.sulfur.dioxide < 29 to the right, agree=0.656, adj=0.267, (0 split)
## pH < 3.035 to the left, agree=0.613, adj=0.173, (0 split)
## citric.acid < 0.245 to the right, agree=0.581, adj=0.107, (0 split)
##
## Node number 24: 254 observations, complexity param=0.002253944
## predicted class=high expected loss=0.2047244 P(node) =0.07096954
## class counts: 202 52
## probabilities: 0.795 0.205
## left son=48 (249 obs) right son=49 (5 obs)
## Primary splits:
## pH < 2.88 to the right, improve=3.614686, (0 missing)
## citric.acid < 0.685 to the left, improve=2.296870, (0 missing)
## sulphates < 0.555 to the right, improve=2.135927, (0 missing)
## alcohol < 9.45 to the right, improve=1.868311, (0 missing)
## total.sulfur.dioxide < 123.5 to the left, improve=1.816450, (0 missing)
##
## Node number 25: 107 observations, complexity param=0.002629602
## predicted class=high expected loss=0.4205607 P(node) =0.02989662
## class counts: 62 45
## probabilities: 0.579 0.421
## left son=50 (40 obs) right son=51 (67 obs)
## Primary splits:
## residual.sugar < 2.95 to the right, improve=6.215204, (0 missing)
## fixed.acidity < 6.85 to the left, improve=5.831802, (0 missing)
## density < 0.9963 to the right, improve=3.926798, (0 missing)
## chlorides < 0.031 to the right, improve=2.855247, (0 missing)
## total.sulfur.dioxide < 99.5 to the left, improve=2.179469, (0 missing)
## Surrogate splits:
## density < 0.99421 to the right, agree=0.776, adj=0.400, (0 split)
## free.sulfur.dioxide < 24.5 to the right, agree=0.738, adj=0.300, (0 split)
## alcohol < 9.65 to the left, agree=0.692, adj=0.175, (0 split)
## chlorides < 0.0495 to the right, agree=0.664, adj=0.100, (0 split)
##
## Node number 26: 272 observations, complexity param=0.00864012
## predicted class=high expected loss=0.4191176 P(node) =0.07599888
## class counts: 158 114
## probabilities: 0.581 0.419
## left son=52 (164 obs) right son=53 (108 obs)
## Primary splits:
## chlorides < 0.0495 to the left, improve=7.604681, (0 missing)
## citric.acid < 0.195 to the right, improve=4.331373, (0 missing)
## total.sulfur.dioxide < 162.5 to the left, improve=4.199836, (0 missing)
## sulphates < 0.485 to the right, improve=4.196369, (0 missing)
## alcohol < 9.85 to the right, improve=3.544510, (0 missing)
## Surrogate splits:
## free.sulfur.dioxide < 14.5 to the right, agree=0.654, adj=0.130, (0 split)
## total.sulfur.dioxide < 107.5 to the right, agree=0.647, adj=0.111, (0 split)
## density < 0.99745 to the left, agree=0.643, adj=0.102, (0 split)
## residual.sugar < 4.85 to the right, agree=0.640, adj=0.093, (0 split)
##
## Node number 27: 61 observations, complexity param=0.003756574
## predicted class=low expected loss=0.3114754 P(node) =0.01704387
## class counts: 19 42
## probabilities: 0.311 0.689
## left son=54 (5 obs) right son=55 (56 obs)
## Primary splits:
## pH < 2.975 to the left, improve=5.163934, (0 missing)
## citric.acid < 0.425 to the left, improve=3.697769, (0 missing)
## residual.sugar < 13.05 to the right, improve=3.243676, (0 missing)
## total.sulfur.dioxide < 154.5 to the right, improve=2.346892, (0 missing)
## chlorides < 0.0575 to the right, improve=2.208152, (0 missing)
##
## Node number 28: 190 observations, complexity param=0.006260957
## predicted class=low expected loss=0.4842105 P(node) =0.05308745
## class counts: 92 98
## probabilities: 0.484 0.516
## left son=56 (107 obs) right son=57 (83 obs)
## Primary splits:
## volatile.acidity < 0.555 to the left, improve=8.614981, (0 missing)
## pH < 3.425 to the left, improve=5.423459, (0 missing)
## citric.acid < 0.035 to the right, improve=3.162573, (0 missing)
## density < 0.99496 to the left, improve=3.094336, (0 missing)
## sulphates < 0.725 to the right, improve=2.207081, (0 missing)
## Surrogate splits:
## citric.acid < 0.135 to the right, agree=0.732, adj=0.386, (0 split)
## chlorides < 0.0735 to the left, agree=0.726, adj=0.373, (0 split)
## residual.sugar < 6.15 to the right, agree=0.679, adj=0.265, (0 split)
## total.sulfur.dioxide < 93.5 to the right, agree=0.663, adj=0.229, (0 split)
##
## Node number 29: 71 observations
## predicted class=low expected loss=0.2676056 P(node) =0.01983794
## class counts: 19 52
## probabilities: 0.268 0.732
##
## Node number 30: 434 observations, complexity param=0.002629602
## predicted class=low expected loss=0.3271889 P(node) =0.1212629
## class counts: 142 292
## probabilities: 0.327 0.673
## left son=60 (19 obs) right son=61 (415 obs)
## Primary splits:
## fixed.acidity < 10.45 to the right, improve=5.065405, (0 missing)
## total.sulfur.dioxide < 60.5 to the left, improve=4.571476, (0 missing)
## sulphates < 0.675 to the right, improve=4.542000, (0 missing)
## alcohol < 9.516667 to the right, improve=3.625194, (0 missing)
## citric.acid < 0.255 to the right, improve=3.343132, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 18.5 to the left, agree=0.965, adj=0.211, (0 split)
##
## Node number 31: 358 observations
## predicted class=low expected loss=0.2122905 P(node) =0.1000279
## class counts: 76 282
## probabilities: 0.212 0.788
##
## Node number 40: 700 observations
## predicted class=high expected loss=0.1657143 P(node) =0.1955854
## class counts: 584 116
## probabilities: 0.834 0.166
##
## Node number 41: 9 observations
## predicted class=low expected loss=0.3333333 P(node) =0.002514669
## class counts: 3 6
## probabilities: 0.333 0.667
##
## Node number 42: 95 observations
## predicted class=high expected loss=0.2526316 P(node) =0.02654373
## class counts: 71 24
## probabilities: 0.747 0.253
##
## Node number 43: 44 observations
## predicted class=low expected loss=0.4090909 P(node) =0.01229394
## class counts: 18 26
## probabilities: 0.409 0.591
##
## Node number 44: 142 observations
## predicted class=high expected loss=0.2183099 P(node) =0.03967589
## class counts: 111 31
## probabilities: 0.782 0.218
##
## Node number 45: 14 observations
## predicted class=low expected loss=0.3571429 P(node) =0.003911707
## class counts: 5 9
## probabilities: 0.357 0.643
##
## Node number 46: 75 observations
## predicted class=high expected loss=0.4 P(node) =0.02095557
## class counts: 45 30
## probabilities: 0.600 0.400
##
## Node number 47: 85 observations
## predicted class=low expected loss=0.2352941 P(node) =0.02374965
## class counts: 20 65
## probabilities: 0.235 0.765
##
## Node number 48: 249 observations
## predicted class=high expected loss=0.1927711 P(node) =0.06957251
## class counts: 201 48
## probabilities: 0.807 0.193
##
## Node number 49: 5 observations
## predicted class=low expected loss=0.2 P(node) =0.001397038
## class counts: 1 4
## probabilities: 0.200 0.800
##
## Node number 50: 40 observations
## predicted class=high expected loss=0.2 P(node) =0.01117631
## class counts: 32 8
## probabilities: 0.800 0.200
##
## Node number 51: 67 observations
## predicted class=low expected loss=0.4477612 P(node) =0.01872031
## class counts: 30 37
## probabilities: 0.448 0.552
##
## Node number 52: 164 observations
## predicted class=high expected loss=0.3231707 P(node) =0.04582286
## class counts: 111 53
## probabilities: 0.677 0.323
##
## Node number 53: 108 observations
## predicted class=low expected loss=0.4351852 P(node) =0.03017603
## class counts: 47 61
## probabilities: 0.435 0.565
##
## Node number 54: 5 observations
## predicted class=high expected loss=0 P(node) =0.001397038
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 55: 56 observations
## predicted class=low expected loss=0.25 P(node) =0.01564683
## class counts: 14 42
## probabilities: 0.250 0.750
##
## Node number 56: 107 observations
## predicted class=high expected loss=0.3831776 P(node) =0.02989662
## class counts: 66 41
## probabilities: 0.617 0.383
##
## Node number 57: 83 observations
## predicted class=low expected loss=0.313253 P(node) =0.02319084
## class counts: 26 57
## probabilities: 0.313 0.687
##
## Node number 60: 19 observations
## predicted class=high expected loss=0.3157895 P(node) =0.005308745
## class counts: 13 6
## probabilities: 0.684 0.316
##
## Node number 61: 415 observations
## predicted class=low expected loss=0.3108434 P(node) =0.1159542
## class counts: 129 286
## probabilities: 0.311 0.689
# Predict the quality class of every test row (column 13 is dropped from the
# predictors; presumably the label column -- confirm against the split code).
predict_rpart <- predict(modeldc, wine_test_for_dc[, -13], type = "class")
# Draw the fitted tree three ways: prp(), base plot()/text(), and rpart.plot().
prp(modeldc, type = 2, extra = 3, tweak = 0.8, main = "The Quality of Wine",
    compress = TRUE)
# The title names the wine-quality tree that is actually being plotted.
plot(modeldc, uniform = TRUE, main = "Classification Tree for Wine Quality")
text(modeldc, use.n = TRUE, all = TRUE, cex = 0.8)
rpart.plot(modeldc)
library(caret)
# Cross-tabulate the predictions against the true test labels.
cm <- confusionMatrix(predict_rpart, as.factor(wine_test_for_dc$qualityvariable))
print('Overall: ')
## [1] "Overall: "
# Overall metrics of the confusion matrix (accuracy, kappa, accuracy bounds and p-values).
cm$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.300402e-01 4.217100e-01 7.085237e-01 7.507786e-01 6.226307e-01
## AccuracyPValue McnemarPValue
## 1.826395e-21 3.327160e-01
print('Class: ')
## [1] "Class: "
# Per-class metrics of the same confusion matrix (sensitivity, specificity, F1, ...).
cm$byClass
## Sensitivity Specificity Pos Pred Value
## 0.7933579 0.6255708 0.7775769
## Neg Pred Value Precision Recall
## 0.6472441 0.7775769 0.7933579
## F1 Prevalence Detection Rate
## 0.7853881 0.6226307 0.4939690
## Detection Prevalence Balanced Accuracy
## 0.6352671 0.7094644
#loadPkg("rpart")
#loadPkg("caret")
# Fit one classification tree per maximum depth (2..20), score it on the test
# set, and collect one row of caret metrics per depth.
# Rows are gathered in a preallocated list and bound once at the end, instead
# of growing the data frame with rbind() on every iteration.
depth_grid <- 2:20
depth_rows <- vector("list", length(depth_grid))
for (i in seq_along(depth_grid)) {
  deep <- depth_grid[i]
  # All tree controls are passed through rpart.control() in one place.
  models <- rpart(qualityvariable ~ ., wine_training_for_dc, method = "class",
                  control = rpart.control(maxdepth = deep, cp = 0.002))
  preds <- predict(models, wine_test_for_dc[, -13], type = "class")
  cm <- confusionMatrix(preds, as.factor(wine_test_for_dc$qualityvariable)) # from caret library
  cmaccu <- cm$overall["Accuracy"]
  # One row: the depth and accuracy, followed by the transposed per-class metrics.
  cmt <- data.frame(Depth = deep, Accuracy = cmaccu, row.names = NULL)
  depth_rows[[i]] <- cbind(cmt, data.frame(t(cm$byClass)))
}
confusionMatrixResultDf <- do.call(rbind, depth_rows)
print(confusionMatrixResultDf)
## Depth Accuracy Sensitivity Specificity Pos.Pred.Value Neg.Pred.Value
## 1 2 0.7156806 0.8348708 0.5190259 0.7411957 0.6557692
## 2 3 0.7156806 0.8348708 0.5190259 0.7411957 0.6557692
## 3 4 0.7260195 0.8035055 0.5981735 0.7674009 0.6485149
## 4 5 0.7306146 0.7933579 0.6270928 0.7782805 0.6477987
## 5 6 0.7306146 0.8219557 0.5799087 0.7634961 0.6637631
## 6 7 0.7260195 0.8035055 0.5981735 0.7674009 0.6485149
## 7 8 0.7306146 0.8145756 0.5920852 0.7671590 0.6593220
## 8 9 0.7277427 0.8274908 0.5631659 0.7576014 0.6642729
## 9 10 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 10 11 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 11 12 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 12 13 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 13 14 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 14 15 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 15 16 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 16 17 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 17 18 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 18 19 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## 19 20 0.7254451 0.8311808 0.5509893 0.7533445 0.6642202
## Precision Recall F1 Prevalence Detection.Rate Detection.Prevalence
## 1 0.7411957 0.8348708 0.7852495 0.6226307 0.5198162 0.7013211
## 2 0.7411957 0.8348708 0.7852495 0.6226307 0.5198162 0.7013211
## 3 0.7674009 0.8035055 0.7850383 0.6226307 0.5002872 0.6519242
## 4 0.7782805 0.7933579 0.7857469 0.6226307 0.4939690 0.6346927
## 5 0.7634961 0.8219557 0.7916482 0.6226307 0.5117748 0.6703044
## 6 0.7674009 0.8035055 0.7850383 0.6226307 0.5002872 0.6519242
## 7 0.7671590 0.8145756 0.7901566 0.6226307 0.5071798 0.6611143
## 8 0.7576014 0.8274908 0.7910053 0.6226307 0.5152211 0.6800689
## 9 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 10 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 11 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 12 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 13 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 14 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 15 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 16 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 17 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 18 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## 19 0.7533445 0.8311808 0.7903509 0.6226307 0.5175187 0.6869615
## Balanced.Accuracy
## 1 0.6769484
## 2 0.6769484
## 3 0.7008395
## 4 0.7102254
## 5 0.7009322
## 6 0.7008395
## 7 0.7033304
## 8 0.6953283
## 9 0.6910851
## 10 0.6910851
## 11 0.6910851
## 12 0.6910851
## 13 0.6910851
## 14 0.6910851
## 15 0.6910851
## 16 0.6910851
## 17 0.6910851
## 18 0.6910851
## 19 0.6910851
library(rpart.plot)
rpart.plot(modeldc)
# Cross-validated error plotted against the complexity parameter.
plotcp(modeldc)
# Prune at the CP value whose cross-validated error (xerror) is smallest.
cp_table <- modeldc$cptable
best_cp <- cp_table[which.min(cp_table[, "xerror"]), "CP"]
prqualityfit <- prune(modeldc, cp = best_cp)
# Share of test rows that the pruned tree classifies correctly.
pred <- predict(prqualityfit, wine_test_for_dc[, -13], type = "class")
accuracy_prun <- mean(pred == as.factor(wine_test_for_dc$qualityvariable))
data.frame(accuracy_prun)
## accuracy_prun
## 1 0.7403791
# Plot the pruned tree.
rpart.plot(prqualityfit)
3.3 Logistic Regression
set.seed(1000)
# Binary target: 1 when quality is above 5, otherwise 0.
qualityvariable <- as.numeric(wine$quality > 5)
# Append the target, then drop columns 1 and 13 by position
# (presumably `type` and `quality`, as in the str() listing -- confirm).
winelogit <- data.frame(wine, qualityvariable)[, -c(1, 13)]
head(winelogit)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 6.5 0.240 0.38 1.0 0.027
## 5017 8.8 0.550 0.04 2.2 0.119
## 2875 5.4 0.230 0.36 1.5 0.030
## 6442 11.1 0.440 0.42 2.2 0.064
## 1301 9.0 0.245 0.38 5.9 0.045
## 1486 7.4 0.280 0.49 1.5 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 5017 14 56 0.99620 3.21 0.60 10.9
## 2875 74 121 0.98976 3.24 0.99 12.1
## 6442 14 19 0.99758 3.25 0.57 10.4
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## qualityvariable
## 3181 1
## 5017 1
## 2875 1
## 6442 1
## 1301 1
## 1486 1
# Label every row 1 or 2 at random, with probabilities 0.67 and 0.33.
wine_sample <- sample.int(2, size = nrow(winelogit), replace = TRUE,
                          prob = c(0.67, 0.33))
# Rows labelled 1 form the training split.
wine_training <- winelogit[wine_sample == 1, ]
head(wine_training)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 6.5 0.240 0.38 1.0 0.027
## 2875 5.4 0.230 0.36 1.5 0.030
## 1301 9.0 0.245 0.38 5.9 0.045
## 1486 7.4 0.280 0.49 1.5 0.034
## 2522 6.5 0.180 0.33 1.4 0.029
## 6415 6.1 0.320 0.25 2.3 0.071
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 2875 74 121 0.98976 3.24 0.99 12.1
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## 2522 35 138 0.99114 3.36 0.60 11.5
## 6415 23 58 0.99633 3.42 0.97 10.6
## qualityvariable
## 3181 1
## 2875 1
## 1301 1
## 1486 1
## 2522 1
## 6415 0
# Rows labelled 2 form the test split.
wine_test <- winelogit[wine_sample == 2, ]
# Report the size of both splits.
nrow(wine_test)
## [1] 1741
nrow(wine_training)
## [1] 3579
# Disabled leftovers: separate label vectors taken from column 12 of winelogit.
#head(winelogit)
#wine.trainLabels_logit <- winelogit[wine_sample==1, 12]
#length(wine.trainLabels_logit)
#wine.testLabels_logit <- winelogit[wine_sample==2, 12]
# Show the first rows of the full modelling table again.
head(winelogit)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 6.5 0.240 0.38 1.0 0.027
## 5017 8.8 0.550 0.04 2.2 0.119
## 2875 5.4 0.230 0.36 1.5 0.030
## 6442 11.1 0.440 0.42 2.2 0.064
## 1301 9.0 0.245 0.38 5.9 0.045
## 1486 7.4 0.280 0.49 1.5 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 5017 14 56 0.99620 3.21 0.60 10.9
## 2875 74 121 0.98976 3.24 0.99 12.1
## 6442 14 19 0.99758 3.25 0.57 10.4
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## qualityvariable
## 3181 1
## 5017 1
## 2875 1
## 6442 1
## 1301 1
## 1486 1
# Pairwise correlations of the training columns, with column 12 excluded.
corrlogit <- cor(wine_training[, -12])
corrlogit
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 1.0000000 0.19671670 0.358316546 -0.1085421
## volatile.acidity 0.1967167 1.00000000 -0.384026118 -0.1603380
## citric.acid 0.3583165 -0.38402612 1.000000000 0.1489293
## residual.sugar -0.1085421 -0.16033804 0.148929286 1.0000000
## chlorides 0.2800980 0.35776885 0.067716597 -0.1199718
## free.sulfur.dioxide -0.2888181 -0.34165027 0.127150508 0.4233792
## total.sulfur.dioxide -0.3269967 -0.40388082 0.188901853 0.4975264
## density 0.4741359 0.30710155 0.108761858 0.5282271
## pH -0.2729064 0.26685772 -0.359684308 -0.2417892
## sulphates 0.3145093 0.22965406 0.075352942 -0.1710380
## alcohol -0.1023721 -0.06196878 -0.005754844 -0.3109122
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## fixed.acidity 0.28009798 -0.28881809 -0.32699667
## volatile.acidity 0.35776885 -0.34165027 -0.40388082
## citric.acid 0.06771660 0.12715051 0.18890185
## residual.sugar -0.11997177 0.42337919 0.49752645
## chlorides 1.00000000 -0.18672133 -0.26844202
## free.sulfur.dioxide -0.18672133 1.00000000 0.72691159
## total.sulfur.dioxide -0.26844202 0.72691159 1.00000000
## density 0.36326534 0.01793178 0.01076784
## pH 0.02438048 -0.15346995 -0.22802001
## sulphates 0.41322151 -0.21377492 -0.29265329
## alcohol -0.26190284 -0.18240184 -0.25132571
## density pH sulphates alcohol
## fixed.acidity 0.47413591 -0.27290636 0.31450933 -0.102372143
## volatile.acidity 0.30710155 0.26685772 0.22965406 -0.061968778
## citric.acid 0.10876186 -0.35968431 0.07535294 -0.005754844
## residual.sugar 0.52822707 -0.24178925 -0.17103796 -0.310912227
## chlorides 0.36326534 0.02438048 0.41322151 -0.261902845
## free.sulfur.dioxide 0.01793178 -0.15346995 -0.21377492 -0.182401841
## total.sulfur.dioxide 0.01076784 -0.22802001 -0.29265329 -0.251325707
## density 1.00000000 0.02424847 0.28498557 -0.662449545
## pH 0.02424847 1.00000000 0.15973535 0.105306308
## sulphates 0.28498557 0.15973535 1.00000000 -0.013009676
## alcohol -0.66244955 0.10530631 -0.01300968 1.000000000
library(corrplot)
# Lower triangle of the correlation matrix, drawn as squares.
corrplot(corrlogit, method = "square", type = "lower")
# Logistic regression of the binary quality target on seven predictors,
# fitted on the training split.
logitmodel <- glm(
  qualityvariable ~ volatile.acidity + citric.acid + residual.sugar +
    free.sulfur.dioxide + pH + sulphates + alcohol,
  family = "binomial",
  data = wine_training
)
summary(logitmodel)
##
## Call:
## glm(formula = qualityvariable ~ volatile.acidity + citric.acid +
## residual.sugar + free.sulfur.dioxide + pH + sulphates + alcohol,
## family = "binomial", data = wine_training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2491 -0.9119 0.4425 0.8082 2.6592
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.763016 1.057595 -12.068 < 2e-16 ***
## volatile.acidity -4.044571 0.300817 -13.445 < 2e-16 ***
## citric.acid -0.205288 0.311280 -0.659 0.509577
## residual.sugar 0.031250 0.009970 3.135 0.001721 **
## free.sulfur.dioxide 0.007085 0.002679 2.645 0.008177 **
## pH 0.975925 0.287548 3.394 0.000689 ***
## sulphates 2.264890 0.318640 7.108 1.18e-12 ***
## alcohol 0.968095 0.044666 21.674 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4724 on 3578 degrees of freedom
## Residual deviance: 3719 on 3571 degrees of freedom
## AIC: 3735
##
## Number of Fisher Scoring iterations: 4
# Exponentiate the fitted coefficients of the logistic model.
expcoeff <- exp(coef(logitmodel))
expcoeff
## (Intercept) volatile.acidity citric.acid residual.sugar
## 2.864789e-06 1.751722e-02 8.144129e-01 1.031744e+00
## free.sulfur.dioxide pH sulphates alcohol
## 1.007110e+00 2.653620e+00 9.630068e+00 2.632925e+00
# Show the first rows of the training split.
head(wine_training)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3181 6.5 0.240 0.38 1.0 0.027
## 2875 5.4 0.230 0.36 1.5 0.030
## 1301 9.0 0.245 0.38 5.9 0.045
## 1486 7.4 0.280 0.49 1.5 0.034
## 2522 6.5 0.180 0.33 1.4 0.029
## 6415 6.1 0.320 0.25 2.3 0.071
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 3181 31 90 0.98926 3.24 0.36 12.3
## 2875 74 121 0.98976 3.24 0.99 12.1
## 1301 52 159 0.99500 2.93 0.35 10.2
## 1486 20 126 0.99180 2.98 0.39 10.6
## 2522 35 138 0.99114 3.36 0.60 11.5
## 6415 23 58 0.99633 3.42 0.97 10.6
## qualityvariable
## 3181 1
## 2875 1
## 1301 1
## 1486 1
## 2522 1
## 6415 0
# Predicted probabilities for the test rows. Column 7 is left out of newdata;
# it is not a term in the model formula.
fitted.results <- predict(logitmodel, newdata = wine_test[, c(1:6, 8:11)],
                          type = "response")
# Classify as 1 when the predicted probability exceeds 0.5.
fitted.results_val <- ifelse(fitted.results > 0.5, 1, 0)
# Misclassification rate on the test split; accuracy is its complement.
misClasificError <- mean(fitted.results_val != wine_test$qualityvariable)
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.729465824238943"
library(car)
## Loading required package: carData
# Variance inflation factor of each model term, as a multicollinearity check.
vif(logitmodel)
## volatile.acidity citric.acid residual.sugar free.sulfur.dioxide
## 1.456212 1.363621 1.437728 1.432348
## pH sulphates alcohol
## 1.292089 1.231206 1.179129
# Sanity check: the predictions and the true labels should have the same length.
length(fitted.results)
## [1] 1741
length(wine_test$qualityvariable)
## [1] 1741
# Confusion matrix of the thresholded predictions against the true test labels.
# The printed result lists class 0 as the 'Positive' class, so the reported
# sensitivity refers to class 0.
confusionMatrix(as.factor(fitted.results_val), as.factor(wine_test$qualityvariable))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 354 168
## 1 303 916
##
## Accuracy : 0.7295
## 95% CI : (0.7079, 0.7502)
## No Information Rate : 0.6226
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4
##
## Mcnemar's Test P-Value : 6.642e-10
##
## Sensitivity : 0.5388
## Specificity : 0.8450
## Pos Pred Value : 0.6782
## Neg Pred Value : 0.7514
## Prevalence : 0.3774
## Detection Rate : 0.2033
## Detection Prevalence : 0.2998
## Balanced Accuracy : 0.6919
##
## 'Positive' Class : 0
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Store the predicted probabilities next to the labels, then build the ROC curve.
wine_test[["prob"]] <- fitted.results
h <- roc(qualityvariable ~ prob, data = wine_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Area under the ROC curve; a value of 0.8 or higher is preferred.
auc(h)
## Area under the curve: 0.8014
# Draw the ROC curve.
plot(h)
3.4 Feature Selection for Logistic Regression
library(leaps)
# Exhaustive best-subset search over seven predictors, keeping one model per
# subset size.
# NOTE(review): regsubsets() fits linear models, so the 0/1 target is treated
# as numeric here -- confirm this is intended.
reg.leaps <- regsubsets(
  qualityvariable ~ volatile.acidity + citric.acid + residual.sugar +
    free.sulfur.dioxide + pH + sulphates + alcohol,
  data = wine_training,
  nbest = 1,
  method = "exhaustive"
)
# Plot the selected subsets ranked by three different criteria.
plot(reg.leaps, scale = "adjr2", main = "Adjusted R^2")
plot(reg.leaps, scale = "bic", main = "BIC")
plot(reg.leaps, scale = "Cp", main = "Cp")
library(bestglm)
head(wine_test)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 5017 8.8 0.55 0.04 2.2 0.119
## 6442 11.1 0.44 0.42 2.2 0.064
## 752 6.9 0.20 0.50 10.0 0.036
## 5243 11.9 0.57 0.50 2.6 0.082
## 2299 6.3 0.41 0.18 3.5 0.027
## 1064 6.7 0.26 0.26 4.1 0.073
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 5017 14 56 0.99620 3.21 0.60 10.9
## 6442 14 19 0.99758 3.25 0.57 10.4
## 752 78 167 0.99640 3.15 0.55 10.2
## 5243 6 32 1.00060 3.12 0.78 10.7
## 2299 23 109 0.99018 3.34 0.54 12.8
## 1064 36 202 0.99560 3.30 0.67 9.5
## qualityvariable prob
## 5017 1 0.5538146
## 6442 1 0.5175007
## 752 1 0.7997941
## 5243 1 0.5306657
## 2299 1 0.9363355
## 1064 0 0.6106754
# Copy of the test split without column 7.
wine_test_1 <- wine_test[, -7]
head(wine_test_1)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 5017 8.8 0.55 0.04 2.2 0.119
## 6442 11.1 0.44 0.42 2.2 0.064
## 752 6.9 0.20 0.50 10.0 0.036
## 5243 11.9 0.57 0.50 2.6 0.082
## 2299 6.3 0.41 0.18 3.5 0.027
## 1064 6.7 0.26 0.26 4.1 0.073
## free.sulfur.dioxide density pH sulphates alcohol qualityvariable
## 5017 14 0.99620 3.21 0.60 10.9 1
## 6442 14 0.99758 3.25 0.57 10.4 1
## 752 78 0.99640 3.15 0.55 10.2 1
## 5243 6 1.00060 3.12 0.78 10.7 1
## 2299 23 0.99018 3.34 0.54 12.8 1
## 1064 36 0.99560 3.30 0.67 9.5 0
## prob
## 5017 0.5538146
## 6442 0.5175007
## 752 0.7997941
## 5243 0.5306657
## 2299 0.9363355
## 1064 0.6106754
# bestglm() uses the LAST column of Xy as the response. wine_test_1 ends with
# the fitted probabilities (`prob`), so that column is dropped and
# qualityvariable is moved to the end. The 0/1 target also needs a binomial
# family instead of the default gaussian one.
# NOTE(review): this runs the selection on the test rows -- confirm whether
# the training split was intended.
bestglm_predictors <- setdiff(names(wine_test_1), c("qualityvariable", "prob"))
bestglm_xy <- wine_test_1[, c(bestglm_predictors, "qualityvariable")]
res.bestglm <- bestglm(Xy = bestglm_xy,
                       family = binomial,
                       IC = "AIC", # Information criterion used to rank models
                       method = "exhaustive")
summary(res.bestglm)
## Fitting algorithm: AIC-leaps
## Best Model:
## df deviance
## Null Model 1729 4.122665
## Full Model 1740 105.525333
##
## likelihood-ratio test - GLM
##
## data: H0: Null Model vs. H1: Best Fit AIC-leaps
## X = 101.4, df = 11, p-value < 2.2e-16
# Table of the top-ranked models: one logical column per variable plus the criterion value.
res.bestglm$BestModels
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 TRUE TRUE TRUE TRUE TRUE
## 2 TRUE TRUE TRUE TRUE FALSE
## 3 TRUE TRUE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE FALSE
## 5 TRUE TRUE TRUE TRUE TRUE
## free.sulfur.dioxide density pH sulphates alcohol qualityvariable Criterion
## 1 TRUE TRUE TRUE TRUE TRUE TRUE -10503.59
## 2 TRUE TRUE TRUE TRUE TRUE TRUE -10503.58
## 3 TRUE FALSE TRUE TRUE TRUE TRUE -10491.27
## 4 TRUE FALSE TRUE TRUE TRUE TRUE -10487.81
## 5 TRUE TRUE TRUE TRUE TRUE FALSE -10486.94
4 Conclusion
Finally, we can see that the Logistic Regression model performed well on the dataset. Even though its accuracy is only about 73 %, its area under the ROC curve (AUC) is ~80.1 %, which makes it a reasonably solid model. The KNN model reached a slightly higher accuracy of ~74 %, but its ROC score was not satisfactory, which contradicts that accuracy. Hence, we did not take the accuracy of KNN into consideration.